MAGs clustered as Dictyochophyceae or putative stramenopiles isolated as a case study to demonstrate how we can use metatranscriptomic data to explore the ecological niches of stramenopiles. Putative stramenopile MAGs were assigned to the broader SAR group (Stramenopiles, Alveolata, and Rhizaria) based on MMSeqs and EUKulele (see Figure 2), but one phylogenetically similar clade was chosen to represent putative heterotrophic stramenopiles likely related to Dictyochophyceae. Core questions to address:
(1) What is the biogeography of these MAGs with respect to region, depth, and size fraction? (2) How does the distribution of MAGs related to predicted trophic mode? (3) What are the core ecological differences among these taxa? (4) Are the currently SAR-assigned MAGs related to the more heterotrophic Dictyochophyceae species?
library(tidyverse); library(broom); library(ggtext); library(directlabels)
library(ggupset); library(cowplot); library(pheatmap)Import and clean metadata for all MAGs.
metaT_metadata <- read.delim("../data/SampleList_2020_metaT.txt")
metaG_metadata <- read.delim("../data/SampleList_2020_metaG.txt")
# Import Lat & Long
tmp_metadata <- read.delim("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/PRJEB4352_metaG_wenv.txt")# Separate ERRs into rows & keep only what I need
metaG_reformat <- metaG_metadata %>%
separate_rows(ERR_list, sep = ", ") %>%
separate(Sub_region, c("REGION", "subregion"), sep = "-", remove = FALSE) %>%
separate(Depth_sizefrac, c("DEPTH", "min", "max"), sep = "-", remove = FALSE) %>%
unite(SIZEFRAC, min, max, sep = "-") %>%
select(-Assembly_group, -ERR_count, run_accession = "ERR_list", REGION, DEPTH, SIZEFRAC) %>%
left_join(tmp_metadata %>% select(run_accession, sampleid = Sample.material, Longitude, Latitude, Station)) %>%
distinct()24 MAGs selected by membership to Dictyochophyceae and closely related heterotroph-predicted stramenopile clade.
mags_tmp <- read.delim("../data/mags-of-interest-24.txt", header = FALSE)
mags_of_interest <- as.character(mags_tmp$V1);mags_of_interest## [1] "TOPAZ_SAS1_E003" "TOPAZ_MSS1_E028" "TOPAZ_IOS1_E045" "TOPAZ_MSS1_E004"
## [5] "TOPAZ_SAS1_E036" "TOPAZ_SPS1_E066" "TOPAZ_SAS1_E034" "TOPAZ_SPS1_E018"
## [9] "TOPAZ_SPS1_E054" "TOPAZ_MSS1_E011" "TOPAZ_NPS1_E005" "TOPAZ_NAS1_E036"
## [13] "TOPAZ_NAX1_E011" "TOPAZ_MSS1_E023" "TOPAZ_NPS1_E025" "TOPAZ_SPS1_E074"
## [17] "TOPAZ_SPS1_E030" "TOPAZ_MSD1_E003" "TOPAZ_MSS1_E026" "TOPAZ_NAS1_E023"
## [21] "TOPAZ_SAD1_E033" "TOPAZ_SPD1_E019" "TOPAZ_SPS1_E070" "TOPAZ_SAS1_E035"
n = 13 from SAR n = 11 are dictyochophyte MAGs (derived from EUKulele & MMSeqs taxonomy assignment)
Import MAG files, including a name schematic key for the MAG TOPAZ-based naming. Also add the estimated taxonomy assignments.
mag_counts_updated <- read.csv("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/MAG_tpm.csv")
mag_rename_key <- read.delim("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/renamed-eukaryotic-mags.tsv", sep = "\t")
# Import MAG taxonomic assignments (Feb 2, 2021)
mag_taxonomy <- read.csv("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/2021-marzoanmmetsp-estimated-taxonomy-levels.csv")
# head(mag_taxonomy)Import list of MAGs with EUKulele taxonomy assignment at lower threshold (n=33); this way it includes more MAGs with dictyochophyte assignments. Incorporate BUSCO completeness and contamination to pare down the list.
euk_thres <- read.delim("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/list_dictyochophyceae.txt", header = FALSE)
buscos <- read.csv("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/EUK_BUSCO_CC.csv")Isolate TPM MAG abundances.
mag_tpm <- mag_counts_updated %>%
left_join(mag_rename_key, by = c("Genome" = "old_mag_name")) %>%
filter(new_mag_name %in% mags_of_interest) %>%
pivot_longer(cols = starts_with("ERR"), names_to = "run_accession", values_to = "tpm") %>%
left_join(metaG_reformat) %>%
select(mag = new_mag_name, -Genome, everything()) %>%
data.frame## Joining, by = "run_accession"
# head(mag_tpm)
# length(unique(mag_tpm$mag))Sum TPM of MAGs based on Size Fraction, Location, and Depth
mag_tpm_location <- mag_tpm %>%
filter(!is.na(REGION)) %>%
group_by(mag, DEPTH, REGION, SIZEFRAC) %>%
summarise(mag_tpm = sum(tpm)) %>%
mutate(log_mag_tpm = log(mag_tpm)) %>%
unite(SAMPLE, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
filter(!is.na(mag_tpm))## `summarise()` has grouped output by 'mag', 'DEPTH', 'REGION'. You can override using the `.groups` argument.
# head(mag_tpm_location)
hist(mag_tpm_location$log_mag_tpm)Save output R objects as data checkpoints to add to available repo.
# save(mag_tpm_location, mag_tpm, file = "../data/MAG-casestudy-Stramenopile-2021.RData")Load existing data from R objects.
load("../data/MAG-casestudy-Stramenopile-2021.RData", verbose = TRUE)## Loading objects:
## mag_tpm_location
## mag_tpm
Visualize MAGs with pheatmap
pheat_mags <- mag_tpm_location %>%
select(mag, SAMPLE, log_mag_tpm) %>%
filter(!is.na(log_mag_tpm)) %>%
mutate_if(is.numeric, function(x) ifelse(is.infinite(x), 0, x)) %>%
pivot_wider(id_cols = SAMPLE, names_from = mag, values_from = log_mag_tpm, values_fill = 0) %>%
column_to_rownames(var = "SAMPLE")## Adding missing grouping variables: `DEPTH`, `REGION`
## `mutate_if()` ignored the following grouping variables:
## Columns `mag`, `DEPTH`, `REGION`
# Obtain annotation
annotate_mags <- mag_tpm_location %>%
ungroup() %>%
filter(!is.na(REGION)) %>%
select(SAMPLE, REGION, DEPTH, SIZEFRAC) %>%
distinct() %>%
column_to_rownames(var = "SAMPLE")
rownames(annotate_mags) <- rownames(pheat_mags)
# Specify color schema
annotation_colors = list(
REGION = c(IO= "#711518", MS= "#ce536b", NAO= "#c76b4a", NPO= "#dfa837", SAO= "#93b778", SO= "#61ac86", SPO= "#657abb", RS= "#67765b"),
DEPTH = c(DCM= "#74a9cf", FSW= "#969696", SRF= "#d0d1e6", MIX= "#0570b0", MES= "#023858", ZZZ= "#252525"),
SIZEFRAC = c(`0.8-5.00` = "#f7fcb9",`180-2000.00` = "#004529", `20-180.00` = "#41ab5d",`5-20.00` = "#addd8e"))MAG relative abundace by sample and TPM.
# png("../si-figures/SI-SAR-Dictyocho-MAG-MAG-relabun-bysample.png", h=1100, w=800)
pheatmap(pheat_mags,
annotation_row = annotate_mags,
annotation_colors = annotation_colors,
scale = "none",
cluster_cols = TRUE,
cluster_rows = TRUE,
clustering_method = "average",
cellwidth = 10, cellheight = 10,
main = "MAG Relative Abundance (CPM)")# dev.off()Plot MAGs based on global distribution - TPM.
library(rworldmap)## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
Figure out latitute and longitude
mag_tpm_tmp <- mag_tpm %>%
mutate(lat_1 = round(Latitude, digits = 1),
long_1 = round(Longitude, digits = 1),
lat_2 = round(Latitude, digits = 2),
lat_3 = round(Latitude, digits = 3),
long_2 = round(Longitude, digits = 2),
long_3 = round(Longitude, digits = 3))
# length(unique(mag_tpm_tmp$lat_1))
# length(unique(mag_tpm_tmp$long_1))Make summary data frame by latitute and longitude.
mag_tpm_map <- mag_tpm_tmp %>%
filter(tpm > 0) %>%
filter(!is.na(REGION)) %>%
group_by(mag, DEPTH, REGION, SIZEFRAC, lat_1, long_1) %>%
summarise(mag_tpm_latlong = sum(tpm)) %>%
mutate(log_mag_tpm = log(mag_tpm_latlong)) %>%
unite(SAMPLE, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
filter(!is.na(mag_tpm_latlong)) %>%
mutate(Location = paste(lat_1, long_1))## `summarise()` has grouped output by 'mag', 'DEPTH', 'REGION', 'SIZEFRAC', 'lat_1'. You can override using the `.groups` argument.
Get background world map
map_get_world <- function(resolution="coarse"){
worldMap <- rworldmap::getMap(resolution = resolution) # Change to "coarse" for global maps / "low" for regional maps
world.points <- fortify(worldMap)
world.points$region <- world.points$id
world.df <- world.points[,c("long","lat","group", "region")]
}
# ?rworldmap::getMap
map_world <- function(color_continents = "#969696", color_borders = "#969696", resolution = "coarse") {
world.df <- map_get_world(resolution)
map <- ggplot() +
geom_polygon(data = world.df, aes(x=long, y = lat, group = group), fill=color_continents, color=color_borders) +
# scale_fill_manual(values= color_continents , guide = FALSE) +
scale_x_continuous(breaks = (-4:4) * 45) +
scale_y_continuous(breaks = (-2:2) * 30) +
xlab("Longitude") + ylab("Latitude") +
coord_fixed(1.3) +
theme_bw()
# species_map <- species_map + coord_map () # Mercator projection
# species_map <- species_map + coord_map("gilbert") # Nice for the poles
return(map)
}Factor for depths for the map.
depth_order <- c("DCM", "FSW", "SRF", "MIX", "MES", "ZZZ")
depth_color <- c("#74a9cf", "#969696", "#d0d1e6", "#0570b0", "#023858", "#252525")
mag_tpm_map$DEPTH_ORDER <- factor(mag_tpm_map$DEPTH, levels = depth_order)
#
mag_order <- c("TOPAZ_MSS1_E028","TOPAZ_SAS1_E003","TOPAZ_NAX1_E011","TOPAZ_IOS1_E045","TOPAZ_SAS1_E036","TOPAZ_NAS1_E036","TOPAZ_MSS1_E004","TOPAZ_SAD1_E033","TOPAZ_MSS1_E023","TOPAZ_NPS1_E025","TOPAZ_MSS1_E011","TOPAZ_NPS1_E005","TOPAZ_SPS1_E018","TOPAZ_SPS1_E054","TOPAZ_MSD1_E003","TOPAZ_MSS1_E026","TOPAZ_SAS1_E035","TOPAZ_NAS1_E023","TOPAZ_SPS1_E030","TOPAZ_SPS1_E074","TOPAZ_SPD1_E019","TOPAZ_SPS1_E070","TOPAZ_SAS1_E034","TOPAZ_SPS1_E066")
mag_tpm_map$MAG_ORDER <- factor(mag_tpm_map$mag, levels = mag_order)# png("../si-figures/SI-stramenopile-MAG-map.png", w=1100,h=850)
map_world() +
geom_jitter(data = mag_tpm_map,
aes(x = long_1, y = lat_1,
fill = DEPTH_ORDER,
size = mag_tpm_latlong),
shape = 21, color = "black") +
scale_size_continuous(range = c(1,14)) +
scale_fill_manual(values = depth_color) +
facet_wrap(MAG_ORDER ~ ., ncol = 4) +
theme(strip.background = element_blank(),
strip.text = element_text(face = "bold", size = 14, hjust = 1, vjust = 1),
axis.text = element_text(size = 10, hjust = 1, vjust = 1),
legend.title = element_blank())## Regions defined for each Polygons
## Warning: Removed 24 rows containing missing values (geom_point).
# dev.off()Import the results from metatranscriptome mapping onto putative dictyochophyte and SAR group MAGs. Explore trophic roles and shared/unique functional profiles.
For loop to determine summed transcript counts that hit the 11 core putative dictyochophyte MAGs.
# For loop to import all data and compile
summed_counts <- list.files(path = "/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data",pattern = "tpm.summedcounts")
# summed_counts
# rm(imported)
# rm(compiled_metaT_hits)
for (a in summed_counts) {
imported <- read.csv(paste("/Users/sarahhu/Desktop/Projects/eukheist_proj/dictyochophyceae-analysis/input-data/", a, sep = ""))
names <- unlist(strsplit(a,".tpm.summedcounts"))
imported$MAG <- names[1]
if (!exists("compiled_metaT_hits")) {
compiled_metaT_hits <- imported
} else {
compiled_metaT_hits <- rbind(compiled_metaT_hits, imported)
}
rm(imported)
}
# rm(compiled_metaT_hits)
# rm(imported)# Rename MAGs with TOPAZ name schematic
metaT_mapped <- compiled_metaT_hits %>%
left_join(mag_rename_key, by = c("MAG" = "old_mag_name")) %>%
select(-MAG) %>%
select(MAG = new_mag_name, everything()) %>%
data.frame
# head(metaT_mapped)
# dim(compiled_metaT_hits)Checkpoint for saving the metatranscriptome data with TOPAZ naming schematic.
# save(metaT_mapped, mag_rename_key, file = "../data/MAG-casestudy-Stramenopile-metaT-2021.RData")Match KEGG identities to kegg module information and curated list of kegg pathways of interest.
Import metatranscriptome data from MAGs of interest.
load("../data/MAG-casestudy-Stramenopile-metaT-2021.RData", verbose=T)## Loading objects:
## metaT_mapped
## mag_rename_key
trophy <- read.csv("../data/mag_heterotrophy_rf_13May2021.csv")
# head(mag_rename_key)
# head(trophy)
trophy_refine <- trophy %>%
left_join(mag_rename_key, by = c("MAGs" ="old_mag_name")) %>%
select(-MAGs)
# head(trophy_refine)
# mag_order
# write_delim(filter(trophy_refine, new_mag_name %in% mag_order) %>% distinct(), path = "trophy-24mags.txt", delim = "\t")metaT_mapped_trophy <- metaT_mapped %>%
left_join(trophy_refine, by = c("MAG" = "new_mag_name"))
# Get list of phototrophy predicted MAGs
tmp <- metaT_mapped_trophy %>%
filter(PredictedTrophicMode == "Phototroph")
photo_mags <- as.character(unique(tmp$MAG))
# length(photo_mags)
# head(metaT_mapped)MAG_dendro_binary <- metaT_mapped %>%
type.convert(as.is = TRUE) %>%
select(MAG, KO) %>%
add_column(BIN = 1) %>%
pivot_wider(names_from = "KO", values_from = BIN, values_fill = 0) %>%
column_to_rownames(var = "MAG") %>%
data.frame
# ?pivot_wider
# head(MAG_dendro_binary)library(ggdendro)
cluster_mags <- hclust(dist((MAG_dendro_binary)), method = "average")
dendro_mags <- dendro_data(as.dendrogram(cluster_mags), type = "rectangle")
# head(dendro_mags)
labeled_dendro_order <- as.character(dendro_mags$labels$label)
# labeled_dendro_orderAdd trophy information
labs <- label(dendro_mags)
labs_refined <- labs %>%
mutate(group = case_when(
label %in% photo_mags ~ "Phototroph",
TRUE ~ "Heterotroph"
))
# head(labs_refined)Plot dendrogram - derived from the presence/absence of known orthologs.
dendro_mag_plot <- ggplot(segment(dendro_mags)) +
geom_segment(aes(x = x, y = y, xend = xend,
yend = yend)) +
coord_flip() +
scale_y_reverse(expand = c(1, 1)) +
geom_text(aes(x = x, y = y, label = label, angle = 0,
hjust = 0, color = labs_refined$group),
data = label(dendro_mags), fontface = "bold", size = 5) +
scale_color_manual(values = c("#CC3D6D", "#B4CC3D")) +
theme_dendro() +
theme(legend.position = "bottom", legend.title = element_blank())# svg("dendrogram-plot-trophy.svg", w=7, h=10)
dendro_mag_plot# dev.off()mag_pie_order <- rev(labeled_dendro_order)Prepare data frame and factor.
region_mags_df <- mag_tpm_location %>%
replace_na(list(mag_tpm=0)) %>%
group_by(REGION, mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))## `summarise()` has grouped output by 'REGION'. You can override using the `.groups` argument.
region_order <- c("IO","MS","NAO","NPO","SAO","SO","SPO","RS")
region_color <- c("#711518", "#ce536b", "#c76b4a", "#dfa837", "#93b778", "#61ac86", "#657abb", "#67765b")
region_mags_df$REGION_ORDER <- factor(region_mags_df$REGION, levels = region_order)
region_mags_df$MAG_ORDER <- factor(region_mags_df$mag, levels = mag_pie_order)
names(region_color) <- region_order
# View(region_dictyo)Plot pies by region
region_mag <- region_mags_df %>%
add_column(fake = "fake") %>%
ggplot(aes(x = fake, fill = REGION_ORDER, y = mag_sum_tpm)) +
geom_bar(color = "white", stat = "identity", position = "fill") +
scale_fill_manual(values = region_color) +
# theme_linedraw() +
coord_polar(theta = "y") +
# coord_flip() +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 3),
# plot.margin = element_blank(),
plot.background = element_blank()) +
labs(y = "MAG Abundance TPM", x = "Putative Dictyochophyte MAGs") #+
# theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
# panel.margin.y = unit(-2, "lines"))
# region_magPrepare data frame and factor.
depth_mags_df <- mag_tpm_location %>%
replace_na(list(mag_tpm=0)) %>%
group_by(DEPTH, mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))## `summarise()` has grouped output by 'DEPTH'. You can override using the `.groups` argument.
depth_order <- c("DCM", "FSW", "SRF", "MIX", "MES", "ZZZ")
depth_color <- c("#74a9cf", "#969696", "#d0d1e6", "#0570b0", "#023858", "#252525")
depth_mags_df$DEPTH_ORDER <- factor(depth_mags_df$DEPTH, levels = depth_order)
depth_mags_df$MAG_ORDER <- factor(depth_mags_df$mag, levels = mag_pie_order)
names(depth_color) <- depth_order
# View(depth_dictyo)Plot pies by depth
depth_mag <- depth_mags_df %>%
add_column(fake = "fake") %>%
ggplot(aes(x = fake, fill = DEPTH_ORDER, y = mag_sum_tpm)) +
geom_bar(color = "white", stat = "identity", position = "fill") +
scale_fill_manual(values = depth_color) +
# theme_linedraw() +
coord_polar(theta = "y") +
# coord_flip() +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 4)) +
labs(y = "MAG Abundance TPM", x = "Putative Dictyochophyte MAGs")Prepare data frame and factor.
frac_mags_df <- mag_tpm_location %>%
replace_na(list(mag_tpm=0)) %>%
group_by(SIZEFRAC, mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))## `summarise()` has grouped output by 'SIZEFRAC'. You can override using the `.groups` argument.
frac_order <- c("0.8-5.00", "5-20.00", "20-180.00", "180-2000.00")
frac_color <- c("#f7fcb9", "#addd8e", "#41ab5d", "#004529")
frac_mags_df$FRAC_ORDER <- factor(frac_mags_df$SIZEFRAC, levels = frac_order)
frac_mags_df$MAG_ORDER <- factor(frac_mags_df$mag, levels = mag_pie_order)
names(frac_color) <- frac_order
# View(frac_dictyo)Plot pies by size fraction
frac_mag <- frac_mags_df %>%
add_column(fake = "fake") %>%
ggplot(aes(x = fake, fill = FRAC_ORDER, y = mag_sum_tpm)) +
geom_bar(color = "white", stat = "identity", position = "fill") +
scale_fill_manual(values = frac_color) +
# theme_linedraw() +
coord_polar(theta = "y") +
# coord_flip() +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 4)) +
labs(y = "MAG Abundance TPM", x = "Putative Dictyochophyte MAGs")Prepare data frame and factor.
head(mag_tpm_location)## # A tibble: 6 x 7
## # Groups: mag, DEPTH, REGION [2]
## mag SAMPLE DEPTH REGION SIZEFRAC mag_tpm log_mag_tpm
## <fct> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 TOPAZ_IOS1_E045 IO DCM 0.8-5.00 DCM IO 0.8-5.00 8598. 9.06
## 2 TOPAZ_IOS1_E045 IO DCM 180-2000.00 DCM IO 180-2000.… 2176. 7.69
## 3 TOPAZ_IOS1_E045 IO DCM 20-180.00 DCM IO 20-180.00 3185. 8.07
## 4 TOPAZ_IOS1_E045 IO DCM 5-20.00 DCM IO 5-20.00 162. 5.09
## 5 TOPAZ_IOS1_E045 MS DCM 0.8-5.00 DCM MS 0.8-5.00 1648. 7.41
## 6 TOPAZ_IOS1_E045 MS DCM 180-2000.00 DCM MS 180-2000.… 1950. 7.58
total_relabun_mag <- mag_tpm_location %>%
replace_na(list(mag_tpm=0)) %>%
group_by(mag) %>%
summarise(mag_sum_tpm = sum(mag_tpm))
total_relabun_mag$MAG_ORDER <- factor(total_relabun_mag$mag, levels = mag_pie_order)rel_abun <- total_relabun_mag %>%
add_column(fake = "fake") %>%
ggplot(aes(x = fake, size = mag_sum_tpm, y = mag)) +
geom_point(color = "black") +
scale_size_continuous(range = c(1, 17)) +
facet_wrap(~MAG_ORDER, ncol = 1) +
# scale_y_continuous(expand = c(0,0)) +
theme_void() +
theme(legend.title = element_blank(),
strip.text = element_text(size = 4))Compiled plots. Same order as the dendrogram above, so these pie charts can be lined up with the dendrogram.
# svg("pies-mag-abun-withoutlabels.svg", h = 20, w = 15)
# plot_grid(region_mag, depth_mag, frac_mag, rel_abun, nrow = 1)
plot_grid(region_mag + theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
panel.margin.y = unit(-2, "lines")),
depth_mag+ theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
panel.margin.y = unit(-2, "lines")),
frac_mag+ theme(strip.text = element_text(size = rel(3.0), vjust = -4.0),
panel.margin.y = unit(-2, "lines")),
rel_abun,
nrow = 1)## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
# dev.off()First, incorporate KEGG module information into mapped metatranscriptome data.
keggs_to_match <- select(metaT_mapped, KO) %>% distinct()
# head(keggs_to_match)Import provided KEGG information (module information) and list of KEGG IDs selected to model trophic mode.
kegg_vita <- read.csv("../data/vita_vars.csv")
load(file = "../data/KEGG-key.RData", verbose = TRUE)## Loading objects:
## kegg
## K0_gene_wMods
kegg_key_dictyo <- keggs_to_match %>%
left_join(kegg_vita %>% add_column(VITA = "vita"), by = c("KO" = "Selected.KOs")) %>%
left_join(kegg, by = c("KO" = "KO_number")) %>%
data.frame
# head(kegg_key_dictyo)
# dim(keggs_to_match);dim(kegg_key_dictyo)Supplementary figure showing shared and unique metatranscriptome reads among selected MAGs.
# png("../si-figures/SI-SAR-Dictyocho-MAG-KEGG-intersection.png", h=1000, w=900)
metaT_mapped_trophy %>%
type.convert(as.is = TRUE) %>%
pivot_longer(cols = starts_with("ERR"), names_to = "SAMPLEID", values_to = "transcript_tpm") %>%
left_join(kegg_key_dictyo %>% select(KO, B) %>% distinct()) %>%
group_by(MAG, KO, B) %>%
summarise(transcript_tpm_sum = sum(transcript_tpm)) %>%
ungroup() %>%
group_by(KO, B) %>%
summarise(transcript_to_mag = list(MAG)) %>%
ggplot(aes(transcript_to_mag)) +
# geom_bar(color = "black", fill = "#AC7070", width = 0.7) +
geom_bar(color = "black", width = 0.7, aes(fill = B)) +
scale_fill_brewer(palette = "Dark2") +
scale_x_upset(order_by = "freq",
n_intersections = 30, position = "top") +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
theme_combmatrix(combmatrix.label.text = element_text(color = "black", size = 14),
combmatrix.label.extra_spacing = 10) +
theme(axis.text = element_text(color = "black", face = "bold", size = 12),
legend.title = element_blank()) +
labs(x = "Frequency across MAGs",
y = "Shared KEGG IDs",
title = "")## Joining, by = "KO"
## `summarise()` has grouped output by 'MAG', 'KO'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'KO'. You can override using the `.groups` argument.
## Warning: Removed 3805 rows containing non-finite values (stat_count).
# dev.off()Following selection of specific MAGs that may represent phototrophic dictyochophyceae, mixotrophic dictyochophyceae, and MAST - look at the relative TPM expression of these core gene set (see above section for selection) across different ocean regions and depths.
Select MAGs of interest
mags_to_highlight <- c("TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011")
library(gghighlight)# mags_to_highlight <- c("TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011")
fxn_pca_highlightMAG <- function(mag){
ggplot(pca_shared_all, aes(x = PC1, y = PC2, size = HeterotrophyScore, fill = MAG)) +
geom_point(aes(fill = MAG), shape = 21,
stroke = 0.6) +
scale_size_binned(nice.breaks = TRUE, n.breaks = 8, range = c(0.4,8)) +
theme_bw() +
scale_fill_manual(values = c("#fb6a4a")) +
theme(axis.text = element_text(size = 10, color = "black")) + guides(fill = guide_legend(ncol = 2)) +
gghighlight(MAG == mag)
}
# svg("highlight-pca.svg", h = 14, w = 5.5)
# plot_grid(fxn_pca_highlightMAG("TOPAZ_MSS1_E011"),
# fxn_pca_highlightMAG("TOPAZ_SAS1_E035"),
# fxn_pca_highlightMAG("TOPAZ_SPS1_E066"),
# fxn_pca_highlightMAG("TOPAZ_NAX1_E011"),
# ncol = 1, nrow = 4, labels = c("Phototroph predicted TOPAZ_MSS1_E011",
# "Heterotroph predicted TOPAZ_SAS1_E035",
# "Putative mixotroph TOPAZ_SPS1_E066",
# "Putative mixotroph TOPAZ_NAX1_E011"),
# label_size = 8)
# dev.off()Subset genes that are more prominent in certain categories. While the intention is to isolate genes that may be indicative of a more phototrophic versus heterotrophic lifestyle, it is likely that there will be a gene suite that is more prominent in one of those categories. BUT as a whole, analysis of all the genes will be the most appropriate avenue for determining potential trophic strategies.
Biogeography of hypothesized MAST & Dictyochophyceae MAGs Following selection of specific MAGs that may represent phototrophic dictyochophyceae, mixotrophic dictyochophyceae, and MAST - look at the relative TPM expression of these core gene set (see above section for selection) across different ocean regions and depths.
Generate lollipop plot of metaT CPM derived from more phototrophy vs heterotrophy associated KEGG IDs. Map these across samples.
trophy_contri <- read.csv("../data/kegg_training_scores.csv")
# head(trophy_contri)Taking input scores for vita-selected KEGG IDs, and working to distinguish if each ID contributes to phototrophy versus heterotrophy.
vita_keggs_list <- as.character(unique(kegg_vita$Selected.KOs))
trophy_contri_mod <- trophy_contri %>%
select(Predicted.trophic.mode, KEGG_ID_NUM, Ratio) %>%
pivot_wider(names_from = Predicted.trophic.mode, values_from = Ratio) %>%
mutate(VITA = case_when(
KEGG_ID_NUM %in% vita_keggs_list ~ "vita"
)) %>%
# Placement based on whichever has highest ratio
mutate(HIGHEST = case_when(
Heterotroph > Phototroph ~ "Heterotrophy",
TRUE ~ "Phototrophy"
),
# Placement based on the ID having a >0.6 ratio
THRES_0.6 = case_when(
Heterotroph >= 0.6 ~ "Heterotrophy",
Phototroph >= 0.6 ~ "Phototrophy",
TRUE ~ "else"
),
# Placement based on the ID having a >0.8 ratio
THRES_0.8 = case_when(
Heterotroph >= 0.8 ~ "Heterotrophy",
Phototroph >= 0.8 ~ "Phototrophy",
TRUE ~ "else"
))
# head(trophy_contri_mod)
# table(trophy_contri_mod$HIGHEST)
# table(trophy_contri_mod$THRES_0.8)# MetaT mapped to MAGs of interest, summed (CPM) to region, depth, and size fraction
sum_metaT_by_region <- metaT_mapped_trophy %>%
pivot_longer(cols = starts_with("ERR"), names_to = "run_accession", values_to = "transcript_tpm") %>%
left_join(metaT_reformat) %>%
group_by(MAG, KO, REGION, DEPTH, SIZEFRAC, PredictedTrophicMode, HeterotrophyScore) %>%
summarise(SUM = sum(transcript_tpm)) %>%
unite(MAG_troph, MAG, PredictedTrophicMode, sep = " ", remove = FALSE)## Joining, by = "run_accession"
## `summarise()` has grouped output by 'MAG', 'KO', 'REGION', 'DEPTH', 'SIZEFRAC', 'PredictedTrophicMode'. You can override using the `.groups` argument.
Isolate specific MAGs to plot
mags_to_highlight <- c("TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011", "TOPAZ_IOS1_E045", "TOPAZ_SAS1_E003", "TOPAZ_NPS1_E025", "TOPAZ_MSS1_E023")
depths <- c("SRF", "DCM")
subset_lolli_input <- sum_metaT_by_region %>%
type.convert(as.is = TRUE) %>%
filter(MAG %in% mags_to_highlight) %>%
filter(DEPTH %in% depths) %>%
unite(MAG_troph, MAG, PredictedTrophicMode, HeterotrophyScore, sep = " ", remove = FALSE) %>%
left_join(trophy_contri_mod, by = c("KO" = "KEGG_ID_NUM")) %>%
unite(SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
group_by(SAMPLE_INFO, MAG) %>% #relative abundance is based on...
mutate(TOTAL_MAG_TPM = sum(SUM)) %>%
ungroup() %>%
group_by(MAG, PredictedTrophicMode, MAG_troph, SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, HIGHEST) %>%
summarise(SUM_HIGHEST = sum(SUM),
REL_ABUN = SUM_HIGHEST/TOTAL_MAG_TPM) %>%
#Use Relative abundance
mutate(VALUE = case_when(
HIGHEST == "Phototrophy" ~ (-1)*REL_ABUN,
HIGHEST == "Heterotrophy" ~ REL_ABUN
)) %>%
select(TROPHY = HIGHEST, everything()) %>%
distinct()## `summarise()` has grouped output by 'MAG', 'PredictedTrophicMode', 'MAG_troph', 'SAMPLE_INFO', 'REGION', 'DEPTH', 'SIZEFRAC', 'HIGHEST'. You can override using the `.groups` argument.
# Factorization
region_order <- c("IO","MS","NAO","NPO","SAO","SO","SPO","RS")
region_color <- c("#711518", "#ce536b", "#c76b4a", "#dfa837", "#93b778", "#61ac86", "#657abb", "#67765b")
frac_order <- c("0.8-5.00", "5-20.00", "20-180.00", "180-2000.00")
frac_color <- c("#f7fcb9", "#addd8e", "#41ab5d", "#004529")
depth_order <- c("DCM", "FSW", "SRF", "MIX", "MES", "ZZZ")
depth_color <- c("#74a9cf", "#969696", "#d0d1e6", "#0570b0", "#023858", "#252525")
subset_lolli_input$REGION_ORDER <- factor(subset_lolli_input$REGION, levels = region_order)
names(region_color) <- region_order
subset_lolli_input$DEPTH_ORDER <- factor(subset_lolli_input$DEPTH, levels = depth_order)
names(depth_color) <- depth_order
subset_lolli_input$SIZEFRAC_ORDER <- factor(subset_lolli_input$SIZEFRAC, levels = frac_order)
names(frac_color) <- frac_order# "TOPAZ_MSS1_E011","TOPAZ_SAS1_E035","TOPAZ_SPS1_E066","TOPAZ_NAX1_E011"
lolli_metaT <- subset_lolli_input %>%
filter(PredictedTrophicMode == "Phototroph") %>%
# filter(PredictedTrophicMode != "Phototroph") %>%
ggplot(aes(x = SAMPLE_INFO, y = VALUE, fill = TROPHY)) +
geom_hline(yintercept = 0) +
geom_segment(aes(x = SAMPLE_INFO, xend = SAMPLE_INFO,
y = 0, yend = VALUE, color = TROPHY),
lineend = "butt", size = 1) +
geom_point(size = 2, shape = 19, aes(color = TROPHY)) +
scale_color_manual(values = c("#CC3D6D", "#B4CC3D")) +
theme_bw() +
# scale_y_continuous(limits = c(-0.2,0.2)) +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ MAG_troph, space = "free", scales = "free_y") +
theme(axis.line.y = element_line(color = "black"),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
strip.text.y = element_text(size = 10),
legend.position = "none") +
coord_flip() +
labs(x = "", y ="")
# lolli_metaTPlot with color bar (*note there is a bug in R that does not allow me to remove the strip text bar). Repeat below for phototrophic and heterotrophic subset of MAGs.
# svg("heterotrophs-lolli-29052021.svg", h=12, w=18)
# svg("phototrophs-lolli-29052021.svg", h=12, w=18)
plot_grid(lolli_metaT,
ggplot(subset_lolli_input, aes(x = SAMPLE_INFO, fill = SIZEFRAC_ORDER, y = 1)) +
geom_tile() +
scale_fill_manual(values = frac_color) +
theme_bw() +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ ., space = "free", scales = "free_y") +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
# strip.text = element_blank(),
legend.position = "none") +
coord_flip() +
labs(x = "", y =""),
##
ggplot(subset_lolli_input, aes(x = SAMPLE_INFO, fill = DEPTH_ORDER, y = 0.1)) +
geom_tile() +
scale_fill_manual(values = depth_color) +
theme_bw() +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ ., space = "free", scales = "free_y") +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
# strip.text = element_blank(),
legend.position = "none") +
coord_flip() +
labs(x = "", y =""),
##
ggplot(subset_lolli_input, aes(x = SAMPLE_INFO, fill = REGION_ORDER, y = 0.1)) +
geom_tile() +
scale_fill_manual(values = region_color) +
theme_bw() +
facet_grid(SIZEFRAC_ORDER + DEPTH_ORDER + REGION_ORDER ~ ., space = "free", scales = "free_y") +
theme(axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.spacing.x = unit(0, "lines"),panel.spacing.y = unit(0, "lines"),
panel.border = element_blank(),
panel.grid = element_blank(),
strip.background.x = element_blank(),
# strip.text = element_blank(),
legend.position = "none") +
coord_flip() +
labs(x = "", y =""),
align = "h", axis = "t", nrow = 1,
rel_widths = c(1,0.1, 0.1, 0.1))## Warning: Removed 224 rows containing missing values (geom_segment).
## Warning: Removed 224 rows containing missing values (geom_point).
# dev.off()
# ?plot_grid()Try x-y scatter plot on this
# head(subset_lolli_input)
# subset_lolli_input %>%
# filter(PredictedTrophicMode == "Phototroph") %>%
# # filter(PredictedTrophicMode != "Phototroph") %>%
# filter(!is.na(TROPHY)) %>%
# select(-SUM_HIGHEST, -VALUE) %>%
# pivot_wider(names_from = TROPHY, values_from = REL_ABUN) %>%
# ggplot(aes(x = Heterotrophy, y = Phototrophy, fill = MAG_troph)) +
# geom_point(shape = 21, size = 2, color = "black") +
# theme_linedraw() +
# # scale_y_continuous(limits = c(0,0.2)) +
# # scale_x_continuous(limits = c(0,0.2))
# scale_y_log10() + scale_x_log10()# head(sum_metaT_by_region)
# head(trophy_contri_mod)# head(sum_metaT_by_region)
# df_pheat_subset <- sum_metaT_by_region %>%
# type.convert(as.is = TRUE) %>%
# unite(MAG_troph, MAG, PredictedTrophicMode, HeterotrophyScore, sep = " ", remove = FALSE) %>%
# left_join(trophy_contri_mod, by = c("KO" = "KEGG_ID_NUM")) %>%
# filter(VITA == "vita") %>%
# unite(SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, sep = " ", remove = FALSE) %>%
# group_by(MAG, PredictedTrophicMode, MAG_troph, SAMPLE_INFO, REGION, DEPTH, SIZEFRAC, HIGHEST) %>%
# summarise(SUM_HIGHEST = sum(SUM)) %>%
# distinct()
# head(df_pheat_subset)sessionInfo()## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS/LAPACK: /Users/sarahhu/anaconda3/envs/r_3.6.0/lib/R/lib/libRblas.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gghighlight_0.3.1 ggdendro_0.1-20 rworldmap_1.3-6
## [4] sp_1.4-2 pheatmap_1.0.12 cowplot_1.0.0
## [7] ggupset_0.3.0 directlabels_2021.1.13 ggtext_0.1.0
## [10] broom_0.7.5 forcats_0.5.0 stringr_1.4.0
## [13] dplyr_1.0.5 purrr_0.3.4 readr_1.3.1
## [16] tidyr_1.1.3 tibble_3.1.0 ggplot2_3.3.1
## [19] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 maps_3.3.0 jsonlite_1.6.1
## [4] dotCall64_1.0-0 modelr_0.1.8 assertthat_0.2.1
## [7] highr_0.8 tensorA_0.36.1 blob_1.2.1
## [10] cellranger_1.1.0 robustbase_0.93-6 yaml_2.2.1
## [13] pillar_1.5.1 backports_1.2.1 lattice_0.20-41
## [16] glue_1.4.2 quadprog_1.5-8 digest_0.6.27
## [19] RColorBrewer_1.1-2 gridtext_0.1.3 rvest_0.3.5
## [22] colorspace_2.0-0 htmltools_0.5.1.1 pkgconfig_2.0.3
## [25] haven_2.3.1 scales_1.1.1 generics_0.1.0
## [28] farver_2.1.0 ellipsis_0.3.1 withr_2.4.1
## [31] cli_2.4.0 magrittr_2.0.1 crayon_1.4.1
## [34] readxl_1.3.1 maptools_1.0-1 evaluate_0.14
## [37] fs_1.4.1 fansi_0.4.2 MASS_7.3-51.6
## [40] compositions_1.40-5 xml2_1.3.2 foreign_0.8-76
## [43] tools_3.6.1 hms_0.5.3 lifecycle_1.0.0
## [46] munsell_0.5.0 reprex_0.3.0 compiler_3.6.1
## [49] rlang_0.4.10 debugme_1.1.0 grid_3.6.1
## [52] rstudioapi_0.11 spam_2.6-0 labeling_0.4.2
## [55] rmarkdown_2.6 gtable_0.3.0 DBI_1.1.0
## [58] R6_2.5.0 bayesm_3.1-4 lubridate_1.7.8
## [61] knitr_1.31 utf8_1.2.1 stringi_1.5.3
## [64] Rcpp_1.0.5 fields_11.6 vctrs_0.3.7
## [67] DEoptimR_1.0-8 dbplyr_1.4.4 tidyselect_1.1.0
## [70] xfun_0.20